#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Bootstrap the notebook namespace by executing two helper scripts in place.
# Presumably init_path.py defines the path constants (P_Lib is used on the
# next line) and GasStation.py supplies the pandas / multiprocessing names and
# the clean_* helpers used below -- none of them are imported in this file.
# The handle is opened in a `with` block so it is closed deterministically
# instead of being left for the garbage collector.
with open('init_path.py') as init_file:
    exec(init_file.read())
exec((P_Lib / 'GasStation.py').read_text())
get_ipython().run_line_magic('matplotlib', 'inline')


# # Azure Version

# In[2]:


# Download Data
# cd /Users/Titanic/Documents/Data/GasStation
# git clone https://dev.azure.com/tankerkoenig/_git/tankerkoenig-data
# Unzip Data
# unzip tankerkoenig-data.zip -d tankerkoenig-data/


# In[3]:


# Prices: basic cleanup & convert csv to h5 into HDFSplit
# Collect every regular file below .../prices and order them by the integer
# built from the first three dash-separated fields of the file name
# (presumably YYYY-MM-DD, i.e. chronological order).
files = sorted(
    (f for f in (P_GS_Data_Raw / 'tankerkoenig-data' / 'prices').rglob('*') if f.is_file()),
    key=lambda f: int(''.join(f.name.split('-')[:3])),
)


# In[5]:


# Plausibility bounds for a price after it has been multiplied by 1000 and
# rounded (see clean_up_raw_file): values >= P_MAX or <= P_MIN become -1.
P_MAX = 4000
P_MIN = 300


# In[6]:


# Very Basic Cleanup: Col Names, Bad Data, Convert to HDF
def clean_up_raw_file(f):
    """Clean one raw price CSV and store it as Raw_HDF/<ymd>.h5.

    Steps:
      * rename the 'date' / 'station_uuid' columns to 'Time' / 'ID';
      * scale the 'diesel', 'e5' and 'e10' columns by 1000, round to integers
        and replace every value >= P_MAX or <= P_MIN with -1;
      * drop the last 6 characters of each timestamp string (presumably a
        UTC-offset suffix -- TODO confirm) and parse the rest as datetime;
      * write the frame under key 'GS' with blosc compression, level 9.

    Parameters
    ----------
    f : path-like object with a ``name`` attribute
        Raw CSV file. The first three dash-separated fields of ``f.name`` are
        concatenated to form the output file stem.

    Returns
    -------
    None
    """
    # f.name is already the final path component, so only the dash split is needed.
    ymd = ''.join(f.name.split('-')[:3])

    df = read_csv(f).rename(columns={'date': 'Time', 'station_uuid': 'ID'})

    for fuel in ('diesel', 'e5', 'e10'):
        scaled = (df[fuel] * 1000).round().astype(int)
        out_of_range = (scaled >= P_MAX) | (scaled <= P_MIN)
        scaled[out_of_range] = -1  # -1 marks an implausible price
        df[fuel] = scaled

    df['Time'] = to_datetime(df['Time'].astype(str).str[:-6])

    df.to_hdf(
        P_GS_Data_Raw / 'Raw_HDF' / (ymd + '.h5'),
        key='GS', mode='w', complevel=9, complib='blosc',
    )

# Run clean_up_raw_file over every price file using a worker pool with the
# default number of workers. `ls` receives one entry per file; the function
# has no return statement, so every entry is None.
with Pool() as pool:
    ls = pool.map(clean_up_raw_file, files)


# In[6]:


# # Serial Version
# for f in files:
#     # f = files[0]
#     ymd = ''.join(f.name.split('/')[-1].split('-')[:3])
#     if int(ymd)%100==1:
#         print(int(ymd)//100, end=',')
#     df = read_csv(f)
#     df = df.rename(columns={'date':'Time','station_uuid':'ID'})
#     for col in ['diesel','e5','e10']:
#         df[col] = (df[col]*1000).round().astype(int)
#         df.loc[(df[col]>=P_MAX)|(df[col]<=P_MIN), col] = -1
#     df['Time'] = to_datetime(df.Time.astype(str).str[:-6])
#     df.to_hdf(P_GS_Data_Raw / 'Raw_HDF' / (ymd + '.h5'), 'GS', mode='w', complevel=9, complib='blosc')


# In[7]:


# clean up Gas Station
# Gather every station CSV (recursively), keep the columns of interest,
# stack them into one frame, rename to the short column names and drop
# exact duplicate rows.
files = list((P_GS_Data_Raw / 'tankerkoenig-data' / 'stations').rglob('*.csv'))
cols_gs_original = [
    'uuid', 'name', 'brand', 'street', 'house_number',
    'post_code', 'city', 'latitude', 'longitude',
]
df = (
    concat([read_csv(csv_file)[cols_gs_original] for csv_file in files])
    .rename(columns=dict(zip(
        cols_gs_original,
        ['ID', 'Name', 'Brand', 'St', 'StNum', 'Post', 'City', 'Lat', 'Lng'],
    )))
    .drop_duplicates()
)
df.head(2)


# In[8]:


# df = clean_gas_station(df)
# Store the (uncleaned) station table under key 'GS', overwriting any
# existing file, with blosc compression at level 9.
df.to_hdf(
    P_GS_Data_Raw / 'gas_station_raw_azure.h5',
    key='GS', mode='w', complevel=9, complib='blosc',
)


# In[9]:


# # Delete Raw Data
# import shutil
# shutil.rmtree(P_GS_Data_Raw / 'tankerkoenig-data' / 'prices')


# In[6]:


# then go to Data_Gas_Station







# # Data Dump Version (Not Used Any More)

# In[19]:


# Which dump archive to process in this section (1 or 2).
part = 1
# part = 2

# In[21]:


# Compressed archive for each part; f_dump_gz is only (re)bound when `part`
# is one of the known keys.
dump_archives = {
    1: 'history_dump_20140608_20160502.gz',
    2: 'history_dump_20160502_now.gz',
}
if part in dump_archives:
    f_dump_gz = P_GS_Data_Raw / dump_archives[part]
# Destination of the uncompressed copy.
f_dump = P_GS_Data_Raw / 'history_dump_{}'.format(part)


# ### Split the doc into many small chunks

# In[22]:


# Extract Dump File
# Stream-decompress the gzip archive into f_dump without loading it into memory.
with gzip.open(f_dump_gz, 'rb') as f_in, open(f_dump, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)


# In[23]:


from itertools import chain, islice

def chunks(iterable, n):
    """Lazily split *iterable* into consecutive pieces of up to *n* items.

    chunks(ABCDE,2) => AB CD E

    Each piece is itself a lazy iterator that draws from the same underlying
    iterator, so a piece must be fully consumed before the next one is
    requested; nothing is buffered.
    """
    source = iter(iterable)
    # Pull the first item of each piece eagerly: when the source is exhausted
    # the loop ends and no empty trailing piece is produced.
    for head in source:
        yield chain((head,), islice(source, n - 1))

n_rows = 10 ** 6 # number of rows in each chunk


# In[24]:


# Split the extracted dump into numbered text files of at most n_rows lines
# each: SplitRaw/dump_<part>_<i>.
with open(f_dump) as bigfile:
    for i, lines in enumerate(chunks(bigfile, n_rows)):
        print(i, end='-')  # progress indicator
        file_split = P_GS_Data_Raw / 'SplitRaw' / ('dump_' + str(part) + '_' + str(i))
        with open(file_split, 'w') as f:
            # `lines` is a lazy iterator; writelines consumes it completely,
            # which chunks() needs before the next chunk is requested.
            f.writelines(lines)
        # A stray `break` used to sit here, so only chunk 0 was ever written
        # while the dump itself is deleted right afterwards and the later
        # steps iterate over files[1:-1] and files[-1]. The whole dump is now
        # split.


# In[25]:


# Delete the uncompressed dump file now that it has been split.
f_dump.unlink()


# ### Read Each Chunk & Cleanup

# In[26]:


# Column names applied when a chunk is read as the gas-station table.
cols_gas_station = [
    'ID', 'Ver', 'VerTime', 'Name', 'Brand', 'St',
    'StNum', 'Post', 'City', 'Holiday', 'Lat', 'Lng',
]
# Column names applied when a chunk is read as the price-history table.
cols_price_history = [
    'index', 'ID', 'e5', 'e10', 'diesel', 'Time', 'Changed',
]


# In[27]:


# All chunk files in SplitRaw, ordered by the integer after the last '_'
# in the file name (the chunk index).
files = sorted(
    (P_GS_Data_Raw / 'SplitRaw').glob('dump_*'),
    key=lambda path: int(path.name.rsplit('_', 1)[-1]),
)
# files = files[:5]
files[:2]


# #### Chunk 1

# In[28]:


# Save table schemes & save two tables
# The first chunk is read with the gas-station column names.
df = read_csv(files[0], names=cols_gas_station, sep='\t')


# In[29]:


from itertools import *
from operator import itemgetter
# Count missing cells per row, then collect the index labels of rows with at
# least 10 of them (presumably the schema / header lines of the dump -- TODO
# confirm).
df['nNulls'] = df.isnull().sum(axis=1) # Find Rows With Many Nulls
data = df.loc[df['nNulls'] >= 10].index.values # Separate the Rows
# Group those labels into runs of consecutive values: within a run the
# difference between position and label is constant.
ls_rows_scheme = [
    [row for _, row in run]
    for _, run in groupby(enumerate(data), key=lambda pair: pair[0] - pair[1])
]

# Save Scheme 1: gas station
with open(P_GS_Data / 'dump_scheme_gas_station_{}.txt'.format(part), 'w') as f_scheme:
    f_scheme.write('\n'.join(df.loc[ls_rows_scheme[0], 'ID'].values))

# Save Scheme 2: price history
with open(P_GS_Data / 'dump_scheme_price_history_{}.txt'.format(part), 'w') as f_scheme:
    f_scheme.write('\n'.join(df.loc[ls_rows_scheme[1], 'ID'].values))


# In[31]:


# Gas-station rows sit between the end of the first scheme run and the start
# of the second one (label slicing is inclusive); the trailing helper column
# nNulls is dropped.
gs_first = ls_rows_scheme[0][-1] + 1
gs_last = ls_rows_scheme[1][0] - 1
df_gas_station = clean_gas_station(df.loc[gs_first:gs_last].iloc[:, :-1])
df_gas_station.to_hdf(
    P_GS_Data_Raw / 'gas_station_raw_{}.h5'.format(part),
    key='GS', mode='w', complevel=9, complib='blosc',
)

# Everything after the second scheme run is price history; only the first 7
# columns are kept and relabelled.
df_price_history = df.loc[ls_rows_scheme[1][-1] + 1:].iloc[:, :7]
df_price_history.columns = cols_price_history
df_price_history = clean_price_history(df_price_history)
df_price_history.to_hdf(
    P_GS_Data_Raw / 'Raw_HDF' / 'price_history_{}_0.h5'.format(part),
    key='GS', mode='w', complevel=9, complib='blosc',
)


# #### Other Chunks

# In[37]:


# Every chunk except the first and the last is read as price history,
# cleaned, and written to Raw_HDF/price_history_<part>_<i>.h5.
for f in files[1:-1]:
    i = f.name.rsplit('_', 1)[-1]  # chunk index, kept as a string
    print(i, end='-')
    df = read_csv(f, names=cols_price_history, sep='\t') # Read Raw File
    df = clean_price_history(df) # Clean up
    df.to_hdf(
        P_GS_Data_Raw / 'Raw_HDF' / 'price_history_{}_{}.h5'.format(part, i),
        key='GS', mode='w', complevel=9, complib='blosc',
    )


# In[38]:


# Last Chunk: tail is a bit messy
# With a single chunk file, files[-1] is the first chunk, which is already
# written as price_history_<part>_0.h5; skip so that output is not overwritten.
if len(files) > 1:
    last_file = files[-1]
    i = last_file.name.split('_')[-1]
    print(i, end='-')
    # Read the last chunk itself. This used to read the leftover name `f`
    # (whatever earlier code last bound it to) and stored that data under the
    # last chunk's index.
    df = read_csv(last_file, sep='\t', names=cols_price_history) # Read Raw File
    # Drop rows with more than 2 missing cells, then the helper column.
    df['nNulls'] = df.isnull().sum(axis=1) # Find Rows With Many Nulls
    df = df[df.nNulls <= 2].drop('nNulls', axis=1)
    df = clean_price_history(df) # Clean up
    df.to_hdf(
        P_GS_Data_Raw / 'Raw_HDF' / ('price_history_' + str(part) + '_' + i + '.h5'),
        key='GS', mode='w', complevel=9, complib='blosc',
    )


# ### Delete Temporary Files

# In[39]:


# Remove the temporary chunk files.
for chunk_path in files:
    chunk_path.unlink()




# Which dump archive to process in this section (1 or 2).
# part = 1
part = 2

# In[21]:


# Compressed archive for each part; f_dump_gz is only (re)bound when `part`
# is one of the known keys.
dump_archives = {
    1: 'history_dump_20140608_20160502.gz',
    2: 'history_dump_20160502_now.gz',
}
if part in dump_archives:
    f_dump_gz = P_GS_Data_Raw / dump_archives[part]
# Destination of the uncompressed copy.
f_dump = P_GS_Data_Raw / 'history_dump_{}'.format(part)


# ### Split the doc into many small chunks

# In[22]:


# Extract Dump File
# Stream-decompress the gzip archive into f_dump without loading it into memory.
with gzip.open(f_dump_gz, 'rb') as f_in, open(f_dump, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)


# In[23]:


from itertools import chain, islice

def chunks(iterable, n):
    """Lazily split *iterable* into consecutive pieces of up to *n* items.

    chunks(ABCDE,2) => AB CD E

    Each piece is itself a lazy iterator that draws from the same underlying
    iterator, so a piece must be fully consumed before the next one is
    requested; nothing is buffered.
    """
    source = iter(iterable)
    # Pull the first item of each piece eagerly: when the source is exhausted
    # the loop ends and no empty trailing piece is produced.
    for head in source:
        yield chain((head,), islice(source, n - 1))

n_rows = 10 ** 6 # number of rows in each chunk


# In[24]:


# Split the extracted dump into numbered text files of at most n_rows lines
# each: SplitRaw/dump_<part>_<i>.
with open(f_dump) as bigfile:
    for i, lines in enumerate(chunks(bigfile, n_rows)):
        print(i, end='-')  # progress indicator
        file_split = P_GS_Data_Raw / 'SplitRaw' / ('dump_' + str(part) + '_' + str(i))
        with open(file_split, 'w') as f:
            # `lines` is a lazy iterator; writelines consumes it completely,
            # which chunks() needs before the next chunk is requested.
            f.writelines(lines)
        # A stray `break` used to sit here, so only chunk 0 was ever written
        # while the dump itself is deleted right afterwards and the later
        # steps iterate over files[1:-1] and files[-1]. The whole dump is now
        # split.


# In[25]:


# Delete the uncompressed dump file now that it has been split.
f_dump.unlink()


# ### Read Each Chunk & Cleanup

# In[26]:


# Column names applied when a chunk is read as the gas-station table.
cols_gas_station = [
    'ID', 'Ver', 'VerTime', 'Name', 'Brand', 'St',
    'StNum', 'Post', 'City', 'Holiday', 'Lat', 'Lng',
]
# Column names applied when a chunk is read as the price-history table.
cols_price_history = [
    'index', 'ID', 'e5', 'e10', 'diesel', 'Time', 'Changed',
]


# In[27]:


# All chunk files in SplitRaw, ordered by the integer after the last '_'
# in the file name (the chunk index).
files = sorted(
    (P_GS_Data_Raw / 'SplitRaw').glob('dump_*'),
    key=lambda path: int(path.name.rsplit('_', 1)[-1]),
)
# files = files[:5]
files[:2]


# #### Chunk 1

# In[28]:


# Table 1 is messy: it includes two tables, Gas Station & Price History
# Save table schemes & save two tables
# The first chunk is read with the gas-station column names.
df = read_csv(files[0], names=cols_gas_station, sep='\t')


# In[29]:


from itertools import *
from operator import itemgetter
# Count missing cells per row, then collect the index labels of rows with at
# least 10 of them (presumably the schema / header lines of the dump -- TODO
# confirm).
df['nNulls'] = df.isnull().sum(axis=1) # Find Rows With Many Nulls
data = df.loc[df['nNulls'] >= 10].index.values # Separate the Rows
# Group those labels into runs of consecutive values: within a run the
# difference between position and label is constant.
ls_rows_scheme = [
    [row for _, row in run]
    for _, run in groupby(enumerate(data), key=lambda pair: pair[0] - pair[1])
]

# Save Scheme 1: gas station
with open(P_GS_Data / 'dump_scheme_gas_station_{}.txt'.format(part), 'w') as f_scheme:
    f_scheme.write('\n'.join(df.loc[ls_rows_scheme[0], 'ID'].values))

# Save Scheme 2: price history
with open(P_GS_Data / 'dump_scheme_price_history_{}.txt'.format(part), 'w') as f_scheme:
    f_scheme.write('\n'.join(df.loc[ls_rows_scheme[1], 'ID'].values))


# In[31]:


# Gas-station rows sit between the end of the first scheme run and the start
# of the second one (label slicing is inclusive); the trailing helper column
# nNulls is dropped.
gs_first = ls_rows_scheme[0][-1] + 1
gs_last = ls_rows_scheme[1][0] - 1
df_gas_station = clean_gas_station(df.loc[gs_first:gs_last].iloc[:, :-1])
df_gas_station.to_hdf(
    P_GS_Data_Raw / 'gas_station_raw_{}.h5'.format(part),
    key='GS', mode='w', complevel=9, complib='blosc',
)

# Everything after the second scheme run is price history; only the first 7
# columns are kept and relabelled.
df_price_history = df.loc[ls_rows_scheme[1][-1] + 1:].iloc[:, :7]
df_price_history.columns = cols_price_history
df_price_history = clean_price_history(df_price_history)
df_price_history.to_hdf(
    P_GS_Data_Raw / 'Raw_HDF' / 'price_history_{}_0.h5'.format(part),
    key='GS', mode='w', complevel=9, complib='blosc',
)


# #### Other Chunks

# In[37]:


# Every chunk except the first and the last is read as price history,
# cleaned, and written to Raw_HDF/price_history_<part>_<i>.h5.
for f in files[1:-1]:
    i = f.name.rsplit('_', 1)[-1]  # chunk index, kept as a string
    print(i, end='-')
    df = read_csv(f, names=cols_price_history, sep='\t') # Read Raw File
    df = clean_price_history(df) # Clean up
    df.to_hdf(
        P_GS_Data_Raw / 'Raw_HDF' / 'price_history_{}_{}.h5'.format(part, i),
        key='GS', mode='w', complevel=9, complib='blosc',
    )


# In[38]:


# Last Chunk: tail is a bit messy
# With a single chunk file, files[-1] is the first chunk, which is already
# written as price_history_<part>_0.h5; skip so that output is not overwritten.
if len(files) > 1:
    last_file = files[-1]
    i = last_file.name.split('_')[-1]
    print(i, end='-')
    # Read the last chunk itself. This used to read the leftover name `f`
    # (whatever earlier code last bound it to) and stored that data under the
    # last chunk's index.
    df = read_csv(last_file, sep='\t', names=cols_price_history) # Read Raw File
    # Drop rows with more than 2 missing cells, then the helper column.
    df['nNulls'] = df.isnull().sum(axis=1) # Find Rows With Many Nulls
    df = df[df.nNulls <= 2].drop('nNulls', axis=1)
    df = clean_price_history(df) # Clean up
    df.to_hdf(
        P_GS_Data_Raw / 'Raw_HDF' / ('price_history_' + str(part) + '_' + i + '.h5'),
        key='GS', mode='w', complevel=9, complib='blosc',
    )


# ### Delete Temporary Files

# In[39]:


# Remove the temporary chunk files.
for chunk_path in files:
    chunk_path.unlink()

# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:




